import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Load the crop recommendation dataset from the working directory.
crop_df = pd.read_csv(filepath_or_buffer='Crop_recommendation.csv')
# Preview the first five rows (only rendered in an interactive session).
crop_df.head(n=5)
Dataset contains 2200 records and 8 columns.
# Count missing values per column (only rendered in an interactive session).
crop_df.isna().sum()
# Encode each crop name in 'label' as an integer category code; the codes
# are stored in a new 'no_label' column.
crop_df['no_label'] = crop_df['label'].astype('category').cat.codes
Exploratory Data Analysis:
1] Nitrogen:
# Histogram of the 'N' (nitrogen) column across all samples.
plt.figure(figsize=(8, 7))
sns.histplot(data=crop_df, x='N', color='b')
plt.title("Nitrogen for crops", fontdict={'fontsize': 20})
Importance of Nitrogen:
Nitrogen is an essential nutrient for plant growth, development, and reproduction. Soil nitrogen exists in three general forms: organic nitrogen compounds, ammonium (NH₄⁺) ions, and nitrate (NO₃⁻) ions.
2] Potassium
# Histogram of the 'K' (potassium) column across all samples.
plt.figure(figsize=(8, 7))
sns.histplot(data=crop_df, x='K', color='b')
plt.title("Potassium for crops", fontdict={'fontsize': 20})
# Histogram of the 'P' (phosphorus) column across all samples.
plt.figure(figsize=(8, 7))
sns.histplot(data=crop_df, x='P', color='b')
plt.title("Phosphorus for crops", fontdict={'fontsize': 20})
# Box plot of the 'temperature' column to show its spread and outliers.
plt.figure(figsize=(10, 6))
sns.boxplot(x=crop_df['temperature'])
5] Humidity
# Box plot of the 'humidity' column to show its spread and outliers.
plt.figure(figsize=(10, 6))
sns.boxplot(x=crop_df['humidity'])
6] pH
# Histogram of the 'ph' column across all samples.
plt.figure(figsize=(8, 7))
sns.histplot(data=crop_df, x='ph', color='b')
plt.title("PH for crops", fontdict={'fontsize': 20})
pH stands for "potential of hydrogen" and measures the concentration of hydrogen ions in the soil, i.e. how acidic or alkaline the soil is.
Importance of pH to plants:
pH affects a plant's ability to absorb vital nutrients from the soil. If the soil is too acidic or too alkaline, root growth can be stunted, which in turn restricts water and nutrient uptake.
# Histogram of the 'rainfall' column across all samples.
plt.figure(figsize=(8, 7))
sns.histplot(data=crop_df, x='rainfall', color='b')
plt.title("Rainfall feature", fontdict={'fontsize': 20})
Split data
# Features are every column except the crop name and its integer encoding;
# the integer encoding ('no_label') is the prediction target.
X = crop_df.drop(['label', 'no_label'], axis=1)
y = crop_df.no_label

# Hold out 20% of the rows for testing. A fixed seed makes the split (and
# every metric computed from it) reproducible between runs, and stratifying
# on y keeps each crop's share the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply that same transform
# to the test split so test-set statistics do not leak into training.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)
# Candidate classifiers, keyed by estimator instance and mapped to the
# display name used when reporting results.
models = {
    LogisticRegression(max_iter=500): 'Logistic Regression',
    RandomForestClassifier(): 'Random Forest',
    SVC(): 'Support Vector Machine',
}

# Fit every candidate on the training split.
for model in models:
    model.fit(X_train, y_train)

# Report each fitted model's score on the test split as a percentage.
for model, name in models.items():
    print(f"Accuracy Score for {name} is : ", model.score(X_test, y_test) * 100, "%")
# Train a standalone Random Forest on the training split (fit returns the
# estimator itself), predict the test split, and print the per-class report.
rf = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))
# Plot the confusion matrix of the Random Forest predictions on the test split.

# Derive the class codes from the data instead of hard-coding how many there
# are, so the axes stay correct however many crops the dataset contains.
class_names = np.unique(crop_df['no_label'])

# Passing `labels` fixes the row/column order and keeps the matrix square
# even when a class happens to be absent from this particular test split.
cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots()
# The frame's index/columns become the heatmap tick labels, so no separate
# xticks/yticks calls are needed (the heatmap sets its own ticks when drawn).
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    fmt='d',
    ax=ax,
)
ax.xaxis.set_label_position('top')
plt.title('Confusion Matrix for Random Forest', {'fontsize': 20})
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Run tight_layout last so it accounts for the title and axis labels.
plt.tight_layout()
plt.show()
Thank you :)